notebook.community

Edit and run



In [32]:

    
# imports
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Ploting Libs
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

# For Model Evaluation
from sklearn.model_selection import cross_val_score

Load the titanic train and test data



In [33]:

    
# create the Dataframes from datasets
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')



In [34]:

    
# Display the first 5 rows
train.head()









    Out[34]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [35]:

    
# Display the Last 5 rows
train.tail()









    Out[35]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      886
      887
      0
      2
      Montvila, Rev. Juozas
      male
      27.0
      0
      0
      211536
      13.00
      NaN
      S
    
    
      887
      888
      1
      1
      Graham, Miss. Margaret Edith
      female
      19.0
      0
      0
      112053
      30.00
      B42
      S
    
    
      888
      889
      0
      3
      Johnston, Miss. Catherine Helen "Carrie"
      female
      NaN
      1
      2
      W./C. 6607
      23.45
      NaN
      S
    
    
      889
      890
      1
      1
      Behr, Mr. Karl Howell
      male
      26.0
      0
      0
      111369
      30.00
      C148
      C
    
    
      890
      891
      0
      3
      Dooley, Mr. Patrick
      male
      32.0
      0
      0
      370376
      7.75
      NaN
      Q



In [36]:

    
# See the shape of our datasets (rows, columns)
print(train.shape)
print(test.shape)









    



(891, 12)
(418, 11)



In [37]:

    
train.describe()









    Out[37]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      NaN
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      NaN
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      NaN
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200



In [7]:

    
train.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [8]:

    
# Missing columns data
train.isnull().sum()









    Out[8]:





PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64



In [9]:

    
test.isnull().sum()









    Out[9]:





PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64



In [10]:

    
# Look for the most common value for Embarked
train['Embarked'].value_counts()









    Out[10]:





S    644
C    168
Q     77
Name: Embarked, dtype: int64



In [11]:

    
# Embarked

# From this we can see there're a lot of age columns with missing values
# Cabin has a lot of missing values, let's assume it doesn't play 
# a big rule in our predictions
train = train.drop(['PassengerId','Ticket','Cabin', 'Name'], axis=1)
test    = test.drop(['Ticket','Cabin', 'Name'], axis=1)

# fill the two missing values with the most occurred value (S)
train['Embarked'] = train['Embarked'].fillna("S")

# Set up the matplotlib figure
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))

# Draw barplot to show survivors for Embarked considering Sex
sns.factorplot(x="Embarked", y="Survived", hue="Sex", data=train,
                   size=6, kind="bar")

sns.countplot(x='Embarked', data=train, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=train, order=[0,1], ax=axis2)









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fc25822bba8>



In [12]:

    
# Fare

# CLean Fare in the test dataset
# Fill the missing value
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# use int instead of float
train['Fare'] = train['Fare'].astype(int)
test['Fare'] = test['Fare'].astype(int)

# create plot
train['Fare'].plot(kind='hist', figsize=(10,5),bins=100, xlim=(0,50))









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fc25a62c198>



In [13]:

    
# Age

# fill "NaN" values in empty Age columns
train['Age'][np.isnan(train['Age'])] = train['Age'].mean()
test['Age'][np.isnan(test['Age'])] = test['Age'].mean()

# Now that we haven't missing values we can
# use Age as int instead float
train['Age'] = train['Age'].astype(int)
test['Age'] = test['Age'].astype(int)

# plot the distribuition of people by age
train['Age'].plot(kind='hist', bins=50)









    



/home/denisovitch/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/denisovitch/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fc2581a18d0>



In [14]:

    
# Sex 

fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5))

# How many people survived Vs Died
#sns.countplot(x="Survived", data=train, palette="muted")
sns.countplot(x='Survived', data=train, order=[0,1], ax=axis1)
axis1.set_ylabel('Frequency')

# Survived people by their gender
#sns.countplot(x="Survived", hue='Sex', data=train, palette="muted")
sns.countplot(x='Survived', hue="Sex", data=train, order=[0,1], ax=axis2)
axis2.set_ylabel("Frequency")

axis1.set_xticklabels(['Survived', 'Died'], rotation=0)
axis2.set_xticklabels(['Survived', 'Died'], rotation=0)









    Out[14]:





[<matplotlib.text.Text at 0x7fc257c424a8>,
 <matplotlib.text.Text at 0x7fc257c3a7f0>]



In [15]:

    
# Transforming categorical data to numeric data for our machine learning model
# Current categorical data: Sex, Embarked

# Sex
train.loc[train["Sex"] == "male", "Sex"] = 0 
train.loc[train["Sex"] == "female", "Sex"] = 1

test.loc[test["Sex"] == "male", "Sex"] = 0 
test.loc[test["Sex"] == "female", "Sex"] = 1

# Embarked
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2

test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2

train.head()



In [16]:

    
# Now we're ready to build the machine learning model
X = train.drop(['Survived'], axis=1) # instances to learn from
y = train['Survived'] # target/responses the model is trying to learn to predict



In [22]:

    
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

# Make predictions using the test set.
model.fit(X, y)

# Evaluate acuracy score of the Decision Tree Classifier
# We got 78% with this model
print(cross_val_score(model, X, y, cv=10, scoring='accuracy').mean())









    



0.789121268868

Make Predictions



In [23]:

    
test.head()









    Out[23]:






  
    
      
      PassengerId
      Pclass
      Sex
      Age
      SibSp
      Parch
      Fare
      Embarked
    
  
  
    
      0
      892
      3
      0
      34
      0
      0
      7
      2
    
    
      1
      893
      3
      1
      47
      1
      0
      7
      0
    
    
      2
      894
      2
      0
      62
      0
      0
      9
      2
    
    
      3
      895
      3
      0
      27
      0
      0
      8
      0
    
    
      4
      896
      3
      1
      22
      1
      1
      12
      0



In [24]:

    
predictions = model.predict(test.drop(['PassengerId'], axis=1))

Create Submissionn File



In [38]:

    
# Create a new dataframe
submission = DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
submission.to_csv('submission_dt', index=False)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.00	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.00	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.45	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.00	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.75	NaN	Q

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	NaN	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	NaN	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	NaN	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	892	3	0	34	0	0	7	2
1	893	3	1	47	1	0	7	0
2	894	2	0	62	0	0	9	2
3	895	3	0	27	0	0	8	0
4	896	3	1	22	1	1	12	0

	PassengerId	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	892	3	0	34	0	0	7	2
1	893	3	1	47	1	0	7	0
2	894	2	0	62	0	0	9	2
3	895	3	0	27	0	0	8	0
4	896	3	1	22	1	1	12	0

	PassengerId	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	892	3	0	34	0	0	7	2
1	893	3	1	47	1	0	7	0
2	894	2	0	62	0	0	9	2
3	895	3	0	27	0	0	8	0
4	896	3	1	22	1	1	12	0